# china-stm.R
# Structural topic modeling of open-ended survey responses from Xi'an/Chengdu 2015 survey
# Endre Tvinnereim, November 2015
# Background: http://structuraltopicmodel.com/ ; _vignette_ "stm: R Package for Structural Topic Models"

library(devtools)
library(R.methodsS3)
library(stm)
library(tm, SnowballC)
library(translateR)

# Clear all
rm(list=ls(all=TRUE))
# read data already processed by file "chinopen-prep.R" 
setwd("O:/2.45-china-exp/2-china-STM/data")
load("cn_alldata.RData")
length(cn.alldata$openend) # Length is 1203
# Subset data set
myvars <- c("ID", "openend", "proc.oa", "proc.ctb", 
            "OpenEnd", "Site", "Happen", "Cause", "Concern", "Gender", 
            "Age", "Income", "Educ",  "Race", "climchg", "airpol")
myvars2 <- c("ID", "openend", "proc.oa", "proc.ctb", 
            "OpenEnd", "Site", "Gender", 
            "Age",  "Race", "climchg", "airpol")  
# Note: "openend" = original transcripts; "proc.oa"=processed transcripts (segmented, stop words
# and punctuation removed; "OpenEnd"=English translation by WI students (which=automated translation.)
mydata <- cn.alldata[myvars]
rm(myvars2)
# Recall: proc.oa is the text vector for analysis

# remove missing data
data <- na.omit(mydata) # thanks to http://www.statmethods.net/input/missingdata.html
# How much is lost due to listwise deletion? 
length(data$openend)
# Answer: 78 entries, or 89 with the full set of data. 

#stemming/stopword removal, etc.
processed <- textProcessor(data$proc.oa, 
                           metadata=data, 
                           removestopwords=FALSE,
                           removenumbers=TRUE,
                           removepunctuation=TRUE,
                           stem=FALSE,
                           sparselevel=1, language="na",
                           wordLengths=c(1,Inf),
                           verbose=TRUE)

#structure and index for usage in the stm model. Verify no-missingness.
out <- prepDocuments(processed$documents, processed$vocab, processed$meta, lower.thresh=1)
#output will have object meta, documents, and vocab
docs <- out$documents
vocab <- out$vocab
meta <-out$meta

# List the unique 266 terms here
UniqueTerms266 <- out$vocab
save(UniqueTerms266, file="UniqueTerms266.txt")
rm(UniqueTerms266)

# save workspace before model runs
save(list = ls(all = TRUE), file = "preparedData-china-before-stm.RData")


###########
# RUN STM #
###########
setwd("O:/2.45-china-exp/2-china-STM/manySTM")
# k=10
  chinSelect <- selectModel(out$documents,
                            out$vocab,
                            K=10,
                            prevalence =~ meta$Gender+meta$Age+meta$Income+meta$Educ, 
                            max.em.its=500, 
                            data=out$meta,
                            runs=20,
                            seed=1234) #, emtol=1)
save(chinSelect, docs, vocab, meta, file="Models-2015-12-22-k10.Rdata")


# multiple k
setwd("O:/2.45-china-exp/2-china-STM/manySTM")
# Select the 4 best models out of 20 runs (see Vignette, pp. 8-9)
for (k in 3:10) {chinSelect <- selectModel(out$documents,
                           out$vocab,
                           K=k,
                           prevalence =~ meta$Gender+meta$Age+meta$Income+meta$Educ, 
                           max.em.its=500, 
                           data=out$meta,
                           runs=20,
                           seed=1234) #, emtol=1)
                filename <- paste("preparedModels-china-STM-" , as.character(k), 
                                  ".RData", sep="_", collapse=NULL)
                save(chinSelect, docs, vocab, meta, file = filename)
                }

# End STM run. Now go to analysis files to evaluate these runs and make tables/graphs


